Relevant time scale: Monthly
Datasets: Electricity generation, Rigs count, GDP, OECD Business Tendency Survey, Percentage of Civilian Workforce Unemployed >15 weeks
# load API key
eia_api_key <- readtext::readtext("eia_api_key.txt") %>%
.$text
fred_api_key <- readtext::readtext("fred_api_key.txt") %>%
.$text
eia_set_key(eia_api_key)
## Key stored successfully in package environment.
fredr_set_key(fred_api_key)
## pct labor force unemloyed for more than 15 weeks
LT_unemployed <- fredr(series_id = "U1RATE", observation_start = as.Date("1973-01-01"))
## yield curve 10year minue 3month
yield_curve <- fredr(series_id = "T10Y3M", observation_start=as.Date('1947-01-01'))
## business tendenc
business_tendenc <- fredr(series_id = "BSCICP03USM665S", observation_start = as.Date("1973-01-01"))
## finding local peaks where m is the number of points on either side of the peak
find_peaks <- function (x, m = 3){
shape <- diff(sign(diff(x, na.pad = FALSE)))
pks <- sapply(which(shape < 0), FUN = function(i){
z <- i - m + 1
z <- ifelse(z > 0, z, 1)
w <- i + m + 1
w <- ifelse(w < length(x), w, length(x))
if(all(x[c(z : i, (i + 2) : w)] <= x[i + 1])) return(i + 1) else return(numeric(0))
})
pks <- unlist(pks)
pks
}
distance_from_peak <- function(current_position, peaks_vec, time_series_value){
if (current_position > peaks_vec[1]){
distance <- peaks_vec-current_position
closest <- min(distance)
distance_pct <- (time_series_value[current_position] - time_series_value[current_position+closest])/time_series_value[current_position+closest]
distance_pct <- ifelse(is.null(distance_pct) == T, 0, distance_pct)
distance_pct
} else { distance_pct <- 0
distance_pct}
}
business_local_peaks <- find_peaks(business_tendenc$value)
distance_peak <- sapply(1:dim(business_tendenc)[1], function(x){
distance_pct <- distance_from_peak(x, business_local_peaks, business_tendenc$value)
distance_pct
})
business_tendencies <- cbind(business_tendenc, distance_peak)
business_tendencies[is.na(business_tendencies)] <- 0
write.csv(business_tendencies, "business_tendencies.csv")
business_sentiment <- business_tendencies
## Electricity Data
# hourly demand
lower48_hourly_demand <- paste("http://api.eia.gov/series/?series_id=EBA.US48-ALL.D.H&api_key=", eia_api_key, sep = "") %>%
fromJSON()
lower48_hourly_demand_list <- lower48_hourly_demand$series %>%
select(data)%>%
.[[1]]
date_time <- lower48_hourly_demand_list[[1]][,1]
demand <- lower48_hourly_demand_list[[1]][,2]
lower48_hourly_demand_df <- cbind(date_time, demand) %>%
data.frame(stringsAsFactors = F) %>%
mutate(date = as_date(ymd(substr(date_time, 1, 8))),
hour = as.numeric(substr(date_time, 10, 11)))
## all generation
# last updated 4/26
all_electricity_generation_monthly <- eia_series(id = "TOTAL.ELETPUS.M")
all_electricity_generation_monthly_df <- unnest(all_electricity_generation_monthly, cols="data")
all_electricity_generation_monthly_df <- all_electricity_generation_monthly_df %>%
select(series_id, units, value, date, year, month)
all_electricity_generation_monthly_df <- all_electricity_generation_monthly_df %>%
select(series_id, value, date, year, month) %>%
mutate(terawatt_value = value/1000,
units = "Terawatthours",
date = as.yearmon(unique(substr(date, 1,7))))
# hourly generation
lower48_hourly_generation <- eia_series(id = "EBA.US48-ALL.NG.H")
lower48_hourly_generation_df <- unnest(lower48_hourly_generation, cols = "data") %>%
select(series_id, units, value, date, year, month)
# monthly generation from hourly
lower48_monthly_generation_df <- lower48_hourly_generation_df %>%
group_by(year, month) %>%
summarise(value = (sum(value))/1000000,
units = "Terawatthours",
date = unique(substr(date, 1,7)))
lower48_monthly_generation_df$date <- as.yearmon(lower48_monthly_generation_df$date)
# intersecting all states with lower 48 | then calculate prop lower48 to extrapolate retroactively
intersecting_months <- inner_join(lower48_monthly_generation_df, all_electricity_generation_monthly_df, by = "date")
colnames(intersecting_months)[3] <- "lower48_terawatt"
intersecting_months$proportion <- intersecting_months$lower48_terawatt/intersecting_months$terawatt_value
lower48_prop <- fivenum(intersecting_months$proportion)[3]
all_electricity_generation_monthly_extrapolated_df <- all_electricity_generation_monthly_df %>%
mutate(lower48_terawatt = terawatt_value*lower48_prop) %>%
filter(date < "Jul 2015") %>%
select(year, month, lower48_terawatt, units, date)
colnames(all_electricity_generation_monthly_extrapolated_df)[3] <- "value"
lower48_monthly_generation_extrapolated_df <- bind_rows(all_electricity_generation_monthly_extrapolated_df, lower48_monthly_generation_df)
write.csv(lower48_monthly_generation_extrapolated_df, "monthly_electricity.csv")
electricity <- lower48_monthly_generation_extrapolated_df
## crude oil and nat gas rigs in operation
total_rigs <- eia_series(id = "TOTAL.OGNRPUS.M")
total_rigs_df <- unnest(total_rigs, cols = "data") %>%
select(series_id, units, value, date, year, month)
total_rigs_df <- total_rigs_df %>%
arrange(date) %>%
mutate(change_first_order = round((value - lag(value, 1))/lag(value,1),4),
change_second_order = ifelse(lag(change_first_order,1) == 0, 0, (change_first_order - lag(change_first_order, 1))/lag(change_first_order,1)))
write.csv(total_rigs_df, "monthly_rigs_count.csv")
rigs <- total_rigs_df
gdp_complete <- gdp %>%
mutate(date = as.yearmon(substr((Date), 1, 7)),
`GDP (Continuously Compounding)` = `GDP (Continuously Compounding)`*100,
`GDP (YoY % Change)` = `GDP (YoY % Change)`*100,
gdp_change_lagged = lag(`GDP (Continuously Compounding)`, 1)) %>%
filter(date >= "Jan 1973") %>%
select(date, `GDP (YoY % Change)`, `GDP (Continuously Compounding)`, gdp_change_lagged)
electricity_complete <- electricity %>%
mutate(date = as.yearmon(date),
rate_generated_change = (value - lag(value,1))/lag(value,1),
rate_generated_change_secondorder = (rate_generated_change - lag(rate_generated_change,1))/lag(rate_generated_change,1)) %>%
select(date, rate_generated_change, rate_generated_change_secondorder)
rigs_complete <- rigs %>%
mutate(date = as.yearmon(substr(date, 1, 7))) %>%
select(date, change_first_order, change_second_order)
colnames(rigs_complete)[2:3] <- c("rigs_change_first_order", "rigs_change_second_order")
business_sentiment <- business_sentiment %>%
mutate(date = as.yearmon(substr(date, 1, 7)),
sentiment_change_first_order = (value - lag(value,1))/lag(value,1)) %>%
select(date, value, distance_peak, sentiment_change_first_order)
colnames(business_sentiment)[2] <- "business_sentiment"
sentiment_period <- rep(0, dim(business_sentiment)[1])
for (i in 2:dim(business_sentiment)[1]){
sentiment_i = business_sentiment$sentiment_change_first_order[i]
#print(sentiment_i)
sentiment_h = business_sentiment$sentiment_change_first_order[i-1]
#print(sentiment_h)
sentiment_h[is.na(sentiment_h)] <- 0
if(sign(sentiment_i) == sign(sentiment_h)){
sentiment_period[i] = sentiment_period[i-1]+1
}
else{
sentiment_period[i] = 1
}
}
business_sentiment$sentiment_period <- sentiment_period
LT_unemployed_complete <- LT_unemployed %>%
mutate(date = as.yearmon(substr(date, 1, 7)),
rate_change_first_order = (value - lag(value,1))/lag(value,1)) %>%
select(date, value, rate_change_first_order)
colnames(LT_unemployed_complete)[2] <- "LT_unemployment"
rate_change_period <- rep(0, dim(LT_unemployed_complete)[1])
threshold_period <- rep(0, dim(LT_unemployed_complete)[1])
for (i in 2:dim(LT_unemployed_complete)[1]){
sentiment_i = LT_unemployed_complete$rate_change_first_order[i]
#print(sentiment_i)
sentiment_h = LT_unemployed_complete$rate_change_first_order[i-1]
#print(sentiment_h)
sentiment_h[is.na(sentiment_h)] <- 0
if(sign(sentiment_i) == sign(sentiment_h)|sign(sentiment_i)==0){
rate_change_period[i] = rate_change_period[i-1]+1
}
else{
rate_change_period[i] = 1
}
if(LT_unemployed_complete$LT_unemployment[i] <= 1.5){
threshold_period[i] = threshold_period[i-1]+1
}
}
LT_unemployed_complete$threshold_period <- threshold_period
LT_unemployed_complete$rate_change_period <- rate_change_period
yield_curve[is.na(yield_curve)] <- 0
yield_curve <- yield_curve %>%
drop_na() %>%
mutate(date = as.yearmon(substr(date, 1, 7)),
value = ifelse(value < 0, -1, 1)) %>%
group_by(date) %>%
summarise(value = min(value)) %>%
mutate(value = as.factor(value))
colnames(yield_curve)[2] <- "curve"
# inner join all together and to get recession dates
econ_situation_nber_df <- inner_join(gdp_complete, electricity_complete, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, rigs_complete, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, business_sentiment, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, LT_unemployed_complete, by = "date")
NBER_recession_months <- econ_situation_nber_df$date %>%
.[c(11:27, 85:90, 103:119, 211:219, 339:347, 420:438)]
econ_situation_nber_df <- inner_join(econ_situation_nber_df, yield_curve, by = "date")
econ_situation_nber_df <- econ_situation_nber_df %>%
mutate(NBER_recession = ifelse(date %in% NBER_recession_months, "Y", "N"),
NBER_recession_next = as.factor(lead(NBER_recession,1)),
NBER_recession_two = as.factor(lead(NBER_recession,2))
) %>%
as.data.frame() %>%
drop_na()
DT::datatable(econ_situation_nber_df)
Two random forest models were trained. One to predict the probability of a recession the next month, and another for the following month. A previously-trained model was loaded to save time when knitting this document.
# creating test-train data
testIDs <- createDataPartition(econ_situation_nber_df$NBER_recession_next, p = 0.7, list = F)
train_x <- econ_situation_nber_df[testIDs,c(4:17)]
train_y <- econ_situation_nber_df$NBER_recession_next[testIDs]
test_x <- econ_situation_nber_df[-testIDs, c(4:17)]
test_y <- econ_situation_nber_df$NBER_recession_next[-testIDs]
# model tuning
trctrl <- trainControl(method = "repeatedcv",
number = 10, repeats = 5, search = "grid")
mtry <- ncol(train_x)
ntrees <- 101
tunegrid <- expand.grid(.mtry = c(2:mtry))
metric = "Accuracy"
# following can be commented out if loading a pretrained model from file
# parallel processing to speed things up
# random forest model
# cl <- makePSOCKcluster(5)
# registerDoParallel(cl)
# rf_recession <- train(x = train_x, y = train_y, method = "rf", metric = metric, trControl = trctrl, tuneGrid = tunegrid, ntree = ntrees)
# stopCluster(cl)
# saveRDS(rf_recession, "./models/rf_1mth_all.RDS")
rf_recession <- readRDS("./models/rf_1mth_all.RDS")
predict_y <- predict(rf_recession, newdata = test_x)
predict_y_prob <- predict(rf_recession,newdata = test_x, "prob")
predict_y_prob$date <- econ_situation_nber_df$date[-testIDs]
predict_y_prob$actual <- test_y
predict_y_prob$predicted <- predict_y
cm_1mth_all <- confusionMatrix(predict_y, reference = test_y)
saveRDS(cm_1mth_all, "./models/confusion_matrix_1mth_all.RDS")
cm_1mth_all$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.926471e-01 9.614075e-01 9.597140e-01 9.998139e-01 8.970588e-01
## AccuracyPValue McnemarPValue
## 6.366856e-06 1.000000e+00
prob_model <- ggplotly(ggplot(predict_y_prob, aes(x = date, y = Y)) +
geom_point(aes(color = actual)) +
geom_hline(yintercept = 0.50) +
labs(y = "Probability", x = "Date", colour = "NBER",
title = "Probability of a Recession a Month Ahead", subtitle = "Probabilistic Prediction of a Random Forest Model"))
saveRDS(prob_model, "./models/ggplot_prob_model.RDS")
prob_model
# creating test-train data
testIDs <- createDataPartition(econ_situation_nber_df$NBER_recession_two, p = 0.7, list = F)
train_x <- econ_situation_nber_df[testIDs,c(4:17)]
train_y <- econ_situation_nber_df$NBER_recession_two[testIDs]
test_x <- econ_situation_nber_df[-testIDs, c(4:17)]
test_y <- econ_situation_nber_df$NBER_recession_two[-testIDs]
# model tuning
trctrl <- trainControl(method = "repeatedcv",
number = 10, repeats = 5, search = "grid")
mtry <- ncol(train_x)
ntrees <- 101
tunegrid <- expand.grid(.mtry = c(2:mtry))
metric = "Accuracy"
# following can be commented out if loading a pretrained model from file
# parallel processing to speed things up
# random forest model
# cl <- makePSOCKcluster(5)
# registerDoParallel(cl)
# rf_recession <- train(x = train_x, y = train_y, method = "rf", metric = metric, trControl = trctrl, tuneGrid = tunegrid, ntree = ntrees)
# stopCluster(cl)
# saveRDS(rf_recession, "./models/rf_2mth_all.RDS")
rf_recession <- readRDS("./models/rf_2mth_all.RDS")
predict_y <- predict(rf_recession, newdata = test_x)
predict_y_prob <- predict(rf_recession,newdata = test_x, "prob")
predict_y_prob$date <- econ_situation_nber_df$date[-testIDs]
predict_y_prob$actual <- test_y
predict_y_prob$predicted <- predict_y
cm_2mth_all <- confusionMatrix(predict_y, reference = test_y)
saveRDS(cm_2mth_all, "./models/confusion_matrix_2mth_all.RDS")
cm_2mth_all$byClass
## Sensitivity Specificity Pos Pred Value
## 1.0000000 0.9230769 0.9918699
## Neg Pred Value Precision Recall
## 1.0000000 0.9918699 1.0000000
## F1 Prevalence Detection Rate
## 0.9959184 0.9037037 0.9037037
## Detection Prevalence Balanced Accuracy
## 0.9111111 0.9615385
prob_model_2 <- ggplotly(ggplot(predict_y_prob, aes(x = date, y = Y)) +
geom_point(aes(color = actual)) +
geom_hline(yintercept = 0.50) +
labs(y = "Probability", x = "Date", colour = "NBER",
title = "Probability of a Recession 2 Months Ahead", subtitle = "Probabilistic Prediction of a Random Forest Model"))
saveRDS(prob_model_2, "./models/ggplot_prob_model_2.RDS")
prob_model_2
The following selects a decision tree from the random forest model and plots it. tree_func adopted wholesale from here
# https://shiring.github.io/machine_learning/2017/03/16/rf_plot_ggraph
tree_func <- function(final_model,
tree_num) {
# get tree by index
tree <- randomForest::getTree(final_model,
k = tree_num,
labelVar = TRUE) %>%
tibble::rownames_to_column() %>%
# make leaf split points to NA, so the 0s won't get plotted
mutate(`split point` = ifelse(is.na(prediction), `split point`, NA))
# prepare data frame for graph
graph_frame <- data.frame(from = rep(tree$rowname, 2),
to = c(tree$`left daughter`, tree$`right daughter`))
# convert to graph and delete the last node that we don't want to plot
graph <- graph_from_data_frame(graph_frame) %>%
delete_vertices("0")
# set node labels
V(graph)$node_label <- gsub("_", " ", as.character(tree$`split var`))
V(graph)$leaf_label <- as.character(tree$prediction)
V(graph)$split <- as.character(round(tree$`split point`, digits = 2))
# plot
plot <- ggraph(graph, 'dendrogram') +
theme_bw() +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = node_label), na.rm = TRUE, repel = TRUE) +
geom_node_label(aes(label = split), vjust = 2.5, na.rm = TRUE, fill = "white") +
geom_node_label(aes(label = leaf_label, fill = leaf_label), na.rm = TRUE,
repel = TRUE, colour = "white", fontface = "bold", show.legend = FALSE) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_rect(fill = "white"),
panel.border = element_blank(),
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(size = 18))
print(plot)
}
tree_num <- which(rf_recession$finalModel$forest$ndbigtree == min(rf_recession$finalModel$forest$ndbigtree))
tree_plot <- tree_func(final_model = rf_recession$finalModel, 12)
saveRDS(tree_plot, "./models/tree_plot.RDS")